library(dplyr)
library(ggplot2)
# 5-8
library(readr)
library(tidyverse)
library(correlationfunnel)
library(DataExplorer)
library(WVPlots)
library(ggthemes)
library(ROCR)
library(caret)
library(corrplot)
library(gridExtra)
Load the dataset as below:
data <- read.csv("KAG.csv", stringsAsFactors = FALSE)
str(data)
## 'data.frame': 1143 obs. of 15 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ ad_id : int 708746 708749 708771 708815 708818 708820 708889 708895 708953 708958 ...
## $ campaign_id : int 916 916 916 916 916 916 916 916 916 916 ...
## $ age : int 32 32 32 32 32 32 32 32 32 32 ...
## $ gender : int 0 0 0 0 0 0 0 0 0 0 ...
## $ interest : int 15 16 20 28 28 29 15 16 27 28 ...
## $ Impressions : int 7350 17861 693 4259 4133 1915 15615 10951 2355 9502 ...
## $ Clicks : int 1 2 0 1 1 0 3 1 1 3 ...
## $ Spent : num 1.43 1.82 0 1.25 1.29 ...
## $ Total_Conversion : int 2 2 1 1 1 1 1 1 1 1 ...
## $ Approved_Conversion : int 1 0 0 0 1 1 0 1 0 0 ...
## $ CTR : num 0.0136 0.0112 0 0.0235 0.0242 0 0.0192 0.0091 0.0425 0.0316 ...
## $ CPC : num 1.43 0.91 0 1.25 1.29 ...
## $ CostPerConv_Total : num 0.715 0.91 0 1.25 1.29 0 4.77 1.27 1.5 3.16 ...
## $ CostPerConv_Approved: num 1.43 1.82 0 1.25 1.29 ...
colnames(data)
## [1] "X" "ad_id" "campaign_id"
## [4] "age" "gender" "interest"
## [7] "Impressions" "Clicks" "Spent"
## [10] "Total_Conversion" "Approved_Conversion" "CTR"
## [13] "CPC" "CostPerConv_Total" "CostPerConv_Approved"
summary(data)
## X ad_id campaign_id age
## Min. : 1.0 Min. : 708746 Min. : 916 Min. :32.00
## 1st Qu.: 286.5 1st Qu.: 777632 1st Qu.: 936 1st Qu.:32.00
## Median : 572.0 Median :1121185 Median :1178 Median :37.00
## Mean : 572.0 Mean : 987261 Mean :1067 Mean :38.32
## 3rd Qu.: 857.5 3rd Qu.:1121804 3rd Qu.:1178 3rd Qu.:42.00
## Max. :1143.0 Max. :1314415 Max. :1178 Max. :47.00
## gender interest Impressions Clicks
## Min. :0.0000 Min. : 2.00 Min. : 87 Min. : 0.00
## 1st Qu.:0.0000 1st Qu.: 16.00 1st Qu.: 6504 1st Qu.: 1.00
## Median :0.0000 Median : 25.00 Median : 51509 Median : 8.00
## Mean :0.4821 Mean : 32.77 Mean : 186732 Mean : 33.39
## 3rd Qu.:1.0000 3rd Qu.: 31.00 3rd Qu.: 221769 3rd Qu.: 37.50
## Max. :1.0000 Max. :114.00 Max. :3052003 Max. :421.00
## Spent Total_Conversion Approved_Conversion CTR
## Min. : 0.00 Min. : 0.000 Min. : 0.000 Min. :0.00000
## 1st Qu.: 1.48 1st Qu.: 1.000 1st Qu.: 0.000 1st Qu.:0.01005
## Median : 12.37 Median : 1.000 Median : 1.000 Median :0.01600
## Mean : 51.36 Mean : 2.856 Mean : 0.944 Mean :0.01642
## 3rd Qu.: 60.02 3rd Qu.: 3.000 3rd Qu.: 1.000 3rd Qu.:0.02340
## Max. :639.95 Max. :60.000 Max. :21.000 Max. :0.10590
## CPC CostPerConv_Total CostPerConv_Approved
## Min. :0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.:1.234 1st Qu.: 1.390 1st Qu.: 1.475
## Median :1.450 Median : 8.332 Median : 11.010
## Mean :1.228 Mean : 16.222 Mean : 33.628
## 3rd Qu.:1.600 3rd Qu.: 21.560 3rd Qu.: 41.675
## Max. :2.212 Max. :332.990 Max. :541.700
data %>%
filter (CPC == min(CPC)) %>%
filter (Impressions == max(Impressions)) %>%
select(ad_id, CPC, Impressions)
## ad_id CPC Impressions
## 1 1121094 0 24362
Answer: ad_id = 1121094
data %>%
group_by(campaign_id) %>%
summarise_each(sum, Spent, Impressions) %>%
mutate(Efficiency = Spent/Impressions*1000) %>%
arrange(desc(Efficiency))
## # A tibble: 3 x 4
## campaign_id Spent Impressions Efficiency
## <int> <dbl> <int> <dbl>
## 1 936 2893. 8128187 0.356
## 2 916 150. 482925 0.310
## 3 1178 55662. 204823716 0.272
Answer: Campaign_id 936 had spent least efficiently on brand awareneww on an average
df_3 <- data %>%
filter(interest %in% c(15,21,101), Spent !=0) %>%
mutate(ROAS = round((5*Total_Conversion + 50*Approved_Conversion)/Spent,2)) %>%
select(interest, gender, ROAS) %>%
mutate(gender = as.factor(gender)) %>%
mutate(interest = as.factor(interest))
ggplot(data=df_3, mapping = aes(x=interest, y=ROAS, fill=gender))+
geom_boxplot()+
scale_y_log10()+
xlab('Interest ID')+
ylab('ROAS')+
ggtitle('ROAS By Gender')+
theme(plot.title = element_text(hjust = 0.5), legend.position = 'bottom')
data %>%
filter(campaign_id == 1178, Spent != 0) %>%
mutate(ROAS = round((5*Total_Conversion + 50*Approved_Conversion)/Spent,2)) %>%
mutate(gender = as.factor(gender)) %>%
select(campaign_id, gender,ROAS) %>%
group_by(gender) %>%
summarize(mean_ROAS = mean(ROAS), median_ROAS = median(ROAS))
## # A tibble: 2 x 3
## gender mean_ROAS median_ROAS
## <fct> <dbl> <dbl>
## 1 0 2.55 1.19
## 2 1 1.58 0.73
Load the advertising1 (Links to an external site.) dataset using readr.
df_ad = read_csv('advertising1.csv', col_names=TRUE, col_types='nnnnccfcTf')
str(df_ad)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 1000 obs. of 10 variables:
## $ Daily Time Spent on Site: num 69 80.2 69.5 74.1 68.4 ...
## $ Age : num 35 31 26 29 35 23 33 48 30 20 ...
## $ Area Income : num 61834 68442 59786 54806 73890 ...
## $ Daily Internet Usage : num 256 194 236 246 226 ...
## $ Ad Topic Line : chr "Cloned 5thgeneration orchestration" "Monitored national standardization" "Organic bottom-line service-desk" "Triple-buffered reciprocal time-frame" ...
## $ City : chr "Wrightburgh" "West Jodi" "Davidton" "West Terrifurt" ...
## $ Male : Factor w/ 2 levels "0","1": 1 2 1 2 1 2 1 2 2 2 ...
## $ Country : chr "Tunisia" "Nauru" "San Marino" "Italy" ...
## $ Timestamp : POSIXct, format: "2016-03-27 00:53:11" "2016-04-04 01:39:02" ...
## $ Clicked on Ad : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 2 1 1 ...
## - attr(*, "spec")=
## .. cols(
## .. `Daily Time Spent on Site` = col_number(),
## .. Age = col_number(),
## .. `Area Income` = col_number(),
## .. `Daily Internet Usage` = col_number(),
## .. `Ad Topic Line` = col_character(),
## .. City = col_character(),
## .. Male = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
## .. Country = col_character(),
## .. Timestamp = col_datetime(format = ""),
## .. `Clicked on Ad` = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE)
## .. )
# Histogram of continuous variables Age
p1 <- ggplot(data=df_ad, aes(x=Age))+
geom_histogram(bins=20)+
ggtitle('Histogram of Age')+
theme(text = element_text(size=8))
# Histogram of continuous variable Area Income
p2 <- ggplot(data=df_ad, aes(x=`Area Income`))+
geom_histogram(bins=20)+
ggtitle('Histogram of Area Income')+
theme(text = element_text(size=8))
# Histogram of continuous variables Daily Time Spent on Site
p3 <- ggplot(data=df_ad, aes(x=`Daily Time Spent on Site`))+
geom_histogram(bins=20)+
ggtitle('Histogram of Daily Time Spent on Site')+
theme(text = element_text(size=8))
# Histogram of continuous variable Area Income
p4 <- ggplot(data=df_ad, aes(x=`Daily Internet Usage`))+
geom_histogram(bins=20)+
ggtitle('Histogram of Daily Internet Usage')+
theme(text = element_text(size=8))
grid.arrange(p1, p2, p3, p4, nrow=2, ncol=2)
p5 <- ggplot(data=df_ad, aes(x=Male))+
geom_bar()+
ggtitle('Bar Chart of Gender')
p6 <- ggplot(data=df_ad, aes(x=`Clicked on Ad`))+
geom_bar()+
ggtitle('Bar Chart of Clicked On Ad')
grid.arrange(p5, p6, nrow=1, ncol=2)
pc1 <- ggplot(data=df_ad, aes(y=Age, x=`Clicked on Ad`))+
geom_boxplot()+
ggtitle('Boxplot of Age')+
theme(text = element_text(size=8))
pc2 <- ggplot(data=df_ad, aes(y=`Area Income`, x=`Clicked on Ad`))+
geom_boxplot()+
ggtitle('Boxplot of Area Income')+
theme(text = element_text(size=8))
pc3 <- ggplot(data=df_ad, aes(y=`Daily Internet Usage`, x=`Clicked on Ad`))+
geom_boxplot()+
ggtitle('Boxplot of Daily Internet Usage')+
theme(text = element_text(size=8))
pc4 <- ggplot(data=df_ad, aes(y=`Daily Time Spent on Site`, x=`Clicked on Ad`))+
geom_boxplot()+
ggtitle('Boxplot of Daily Time Spent on Site')+
theme(text = element_text(size=8))
grid.arrange(pc1, pc2, pc3, pc4, nrow=2, ncol=2)
Answer: Yes, from the box plot of Age by Clicked.on.Ad, older person is more likely to click on the ad than younger person.
Part (a) [3 points] 1. Make a scatter plot for Area.Income against Age. Separate the datapoints by different shapes based on if the datapoint has clicked on the ad or not.
ggplot(df_ad, aes(x=Age, y=`Area Income`,shape=`Clicked on Ad`,color=`Clicked on Ad`))+
geom_point()
Answer: No, based on the plot that area is more likely has no click.
Part (b) [3 points]
ggplot(df_ad, aes(x=Age, y=`Daily Time Spent on Site`,shape=`Clicked on Ad`,color=`Clicked on Ad`))+
geom_point()
Answer: Yes, 50 year old and 60 mins is more likely to click.
Part (a) 1. Now that we have done some exploratory data analysis to get a better understanding of our raw data, we can begin to move towards designing a model to predict advert clicks. 2. Generate a correlation funnel (using the correlation funnel package) to see which of the variable in the dataset have the most correlation with having clicked the advert. NOTE: Here we are creating the correlation funnel in regards to HAVING clicked the advert, rather than not. This will lead to a minor distinction in your code between the 2 cases. However, it will not affect your results and subsequent variable selection.
names(df_ad) <- gsub(' ','.',names(df_ad))
df_ad %>%
#select(-c(Ad.Topic.Line,City,Country,Timestamp)) %>%
select(-Timestamp) %>%
mutate_if(is.numeric,as.numeric) %>%
binarize() %>%
correlate(Clicked.on.Ad__1) %>%
plot_correlation_funnel(interactive = TRUE)
Part (b)
lm_a = glm(Clicked.on.Ad~Daily.Internet.Usage+Daily.Time.Spent.on.Site+Age+Area.Income,data=df_ad, family = 'binomial')
summary(lm_a)
##
## Call:
## glm(formula = Clicked.on.Ad ~ Daily.Internet.Usage + Daily.Time.Spent.on.Site +
## Age + Area.Income, family = "binomial", data = df_ad)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4578 -0.1341 -0.0333 0.0167 3.1961
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.713e+01 2.714e+00 9.995 < 2e-16 ***
## Daily.Internet.Usage -6.391e-02 6.745e-03 -9.475 < 2e-16 ***
## Daily.Time.Spent.on.Site -1.919e-01 2.066e-02 -9.291 < 2e-16 ***
## Age 1.709e-01 2.568e-02 6.655 2.83e-11 ***
## Area.Income -1.354e-04 1.868e-05 -7.247 4.25e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1386.3 on 999 degrees of freedom
## Residual deviance: 182.9 on 995 degrees of freedom
## AIC: 192.9
##
## Number of Fisher Scoring iterations: 8
Now using the caret package, create a confusion matrix for the model predictions and actual clicks. Note you do not need to graph or plot this confusion matrix. How many false-negative occurrences do you observe? Recall false negative means the instances where the model predicts the case to be false when in reality it is true. For this example, this refers to cases where the ad is clicked but the model predicts that it isn’t
df_ad$predict_prob <- predict(lm_a, df_ad, type='response')
df_predict <-df_ad %>%
mutate(predict_click = if_else(predict_prob>0.8,1,0)) %>%
mutate(predict_click = as.factor(predict_click))
confusionMatrix(df_predict$predict_click,df_predict$Clicked.on.Ad)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 497 36
## 1 3 464
##
## Accuracy : 0.961
## 95% CI : (0.9471, 0.9721)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.922
##
## Mcnemar's Test P-Value : 2.99e-07
##
## Sensitivity : 0.9940
## Specificity : 0.9280
## Pos Pred Value : 0.9325
## Neg Pred Value : 0.9936
## Prevalence : 0.5000
## Detection Rate : 0.4970
## Detection Prevalence : 0.5330
## Balanced Accuracy : 0.9610
##
## 'Positive' Class : 0
##
Answer: The false negative is reference as 1 and prediction as 0. Based on the matrix, the false negative case is 36.